library(stats)
library(ggplot2)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(stats)
library(ggplot2)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
earnings_df <-read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_quarterly_earnings_data.csv')
head(earnings_df) date education_level median_weekly_earnings
1 2019-01-01 ba_only 1213
2 2019-04-01 ba_only 1236
3 2019-07-01 ba_only 1281
4 2019-10-01 ba_only 1259
5 2020-01-01 ba_only 1263
6 2020-04-01 ba_only 1303
tuition_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_tuition_data.csv')
head(tuition_df) Academic.Year average_tuition degree_granting year_parsed
1 2019-20 15747 Non-Degree Granting 2019-01-01
2 2019-20 25447 Degree Granting 2019-01-01
3 2020-21 26084 Degree Granting 2020-01-01
4 2020-21 15755 Non-Degree Granting 2020-01-01
5 2021-22 26572 Degree Granting 2021-01-01
6 2021-22 15978 Non-Degree Granting 2021-01-01
public_or_private
1 public
2 public
3 public
4 public
5 public
6 public
unemp_emp_ratio_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_monthly_unemployment_&_employment_ratio_data.csv')
head(unemp_emp_ratio_df ) date education_level employment_population_ratio unemployment_rate
1 2019-01-01 some_college_assoc 63.3 3.5
2 2019-02-01 some_college_assoc 63.4 3.2
3 2019-03-01 some_college_assoc 63.0 3.4
4 2019-04-01 some_college_assoc 62.8 3.1
5 2019-05-01 some_college_assoc 63.6 2.7
6 2019-06-01 some_college_assoc 62.8 3.0
# Correct data types before ANOVA
earnings_df <- earnings_df %>%
mutate(
year = factor(format(as.Date(date), "%Y")),
education_level = factor(education_level)
)
tuition_df <- tuition_df |> mutate(year = factor(format(as.Date(year_parsed), "%Y")))library(ggplot2)
ggplot(earnings_df, aes(
x = education_level,
y = median_weekly_earnings,
fill = education_level
)) +
geom_boxplot(alpha = 0.85, outlier.color = "gray40") +
scale_fill_brewer(palette = "Set2") + # cleaner, professional palette
labs(
title = "Median Weekly Earnings by Education Level",
x = "Education Level",
y = "Median Weekly Earnings"
) +
theme_minimal(base_size = 16) + # larger base text for slides
theme(
text = element_text(family = "Helvetica", face = "bold"),
plot.title = element_text(size = 22, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"),
axis.text.x = element_text(size = 14, face = "bold", angle = 20, hjust = 1),
axis.text.y = element_text(size = 14, face = "bold"),
legend.position = "none", # removes redundant legend
panel.grid.major.x = element_blank() # cleaner slide aesthetic
)Check for equal variances
library(car)Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
leveneTest(median_weekly_earnings ~ education_level, data = earnings_df)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 2 6.9734 0.001669 **
75
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(median_weekly_earnings ~ education_level * year, data = earnings_df)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 20 1.2631 0.2415
57
oneway <- aov(median_weekly_earnings ~ education_level, data = earnings_df)
summary(oneway) Df Sum Sq Mean Sq F value Pr(>F)
education_level 2 4505007 2252503 264.3 <2e-16 ***
Residuals 75 639267 8524
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The one-way ANOVA test revealed a highly significant effect of education level on median weekly earnings. This indicates that mean earnings differ substantially across the three education groups.
# Welch's ANOVA
oneway.test(median_weekly_earnings ~ education_level, data = earnings_df)
One-way analysis of means (not assuming equal variances)
data: median_weekly_earnings and education_level
F = 207.99, num df = 2.000, denom df = 48.517, p-value < 2.2e-16
model_two_way <- aov(median_weekly_earnings ~ education_level * year, data = earnings_df)
summary(model_two_way) Df Sum Sq Mean Sq F value Pr(>F)
education_level 2 4505007 2252503 4339.900 < 2e-16 ***
year 6 578857 96476 185.881 < 2e-16 ***
education_level:year 12 30826 2569 4.949 1.54e-05 ***
Residuals 57 29584 519
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Bachelor’s degree holders consistently earn much more than those with some college/associate degrees, who in turn earn more than those with only a high school diploma.
#Games-Howell Test
library(rstatix)
Attaching package: 'rstatix'
The following object is masked from 'package:stats':
filter
earnings_df |> games_howell_test(median_weekly_earnings ~ education_level)# A tibble: 3 × 8
.y. group1 group2 estimate conf.low conf.high p.adj p.adj.signif
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 median_weekly… ba_on… high_… -559. -625. -492. 1.03e-12 ****
2 median_weekly… ba_on… some_… -440. -507. -372. 1.04e-12 ****
3 median_weekly… high_… some_… 119. 68.1 170. 2.37e- 6 ****
limited_earnings_df <- earnings_df[earnings_df$education_level != "high_school",]
limited_earnings_df <- limited_earnings_df |>
mutate(degree_category = case_when(
education_level == "ba_only" ~ "Bachelor's",education_level == "some_college_assoc" ~ "Vocational",
TRUE ~ "NA"
))
tuition_df <- tuition_df |>
mutate(degree_category = case_when(
degree_granting == "Degree Granting" ~ "Bachelor's",
degree_granting == "Non-Degree Granting" ~ "Vocational",
TRUE ~ "NA"
))Join Dataframes together to assess ROI
temp_earnings_df <- limited_earnings_df |> select(c("median_weekly_earnings","year","degree_category"))
temp_tuition_df <- tuition_df |> select(c("average_tuition","year","degree_category"))
merged_df <- merge(temp_earnings_df, temp_tuition_df, by = c("degree_category","year"))
roi_df <- merged_df |> mutate( ROI = median_weekly_earnings / average_tuition )
roi_yearly <- roi_df %>%
group_by(degree_category, year) %>%
summarize(ROI = mean(ROI), .groups = "drop") %>%
arrange(degree_category, year)
roi_clean <- merged_df |>
dplyr::group_by(degree_category, year) |>
dplyr::summarise(
annual_earnings = mean(median_weekly_earnings) * 52,
tuition = mean(average_tuition),
ROI = annual_earnings / tuition, # annualized ROI
.groups = "drop"
)library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
plot <- plot_ly(roi_clean, x = ~year) %>%
add_trace(
y = ~ROI,
color = ~degree_category,
colors = c("Bachelor's" = "#E64B35", "Vocational" = "#4DBBD5"),
type = "scatter",
mode = "lines+markers",
name = ~paste(degree_category, "ROI"),
yaxis = "y"
) %>%
add_trace(
y = ~annual_earnings,
x = ~year,
split = ~degree_category,
type = "bar",
name = ~paste(degree_category, "Earnings"),
opacity = 0.4,
yaxis = "y2",
showlegend = TRUE
) %>%
add_trace(
y = ~tuition,
x = ~year,
split = ~degree_category,
type = "bar",
name = ~paste(degree_category, "Tuition"),
opacity = 0.4,
yaxis = "y2",
showlegend = TRUE
) %>%
layout(
title = "ROI with Earnings and Tuition Over Time",
barmode = "group",
xaxis = list(title = "Year"),
yaxis = list(
title = "ROI (Earnings / Tuition)",
rangemode = "tozero"
),
yaxis2 = list(
title = "Dollars (Earnings & Tuition)",
overlaying = "y",
side = "right",
rangemode = "tozero"
),
legend = list(title = list(text = "<b>Series</b>"))
)
# htmlwidgets::saveWidget(plot, "roi_plot.html")
plotvocational_df <- unemp_emp_ratio_df[unemp_emp_ratio_df$education_level == "some_college_assoc",]
vocational_df$date<-as.Date(vocational_df$date,"%Y-%m-%d")
vocational_df$employment_population_ratio <-as.numeric(vocational_df$employment_population_ratio )
fig <- plot_ly(vocational_df, x = ~date, y = ~employment_population_ratio, type = 'scatter', mode = 'lines')
fig <- fig %>% layout(title = "Employment Population Ratio from 2019 to 2025")
fig